from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Lars, Ridge, ElasticNet, LassoLars, LassoLarsCV, LinearRegression
import re
from umap import UMAP
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import gower
import pickle
from collections import Counter
import plotly.express as px
from xgboost import XGBRFRegressor
import shap
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
# import the real estate price analytics library
from lib.real_estate_analytics_library import *
# optional - suppress warnings
import warnings
warnings.filterwarnings('ignore')
# The root page link is used to generate the links for all pages: appending a
# page number to it addresses one page of search results (rental listings in
# Zurich, all property types).
root = 'https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A%2210%22%2C%22LocationSearchString%22%3A%22Zurich%22%2C%22RootPropertyTypes%22%3A%5B%220%22%5D%2C%22PriceTo%22%3A%22-10%22%2C%22RoomsFrom%22%3A%22-10%22%2C%22Sort%22%3A%2211%22%2C%22AdAgeMax%22%3A-1%2C%22ComparisPointsMin%22%3A-1%2C%22SiteId%22%3A-1%7D&sort=11&page='
# Fetch the first page of results for Zurich so we can discover how many
# result pages exist for this location.
links_page = requests.get('https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A%2210%22%2C%22LocationSearchString%22%3A%22Zurich%22%2C%22RootPropertyTypes%22%3A%5B%220%22%5D%2C%22PriceTo%22%3A%22-10%22%2C%22RoomsFrom%22%3A%22-10%22%2C%22Sort%22%3A%2211%22%2C%22AdAgeMax%22%3A-1%2C%22ComparisPointsMin%22%3A-1%2C%22SiteId%22%3A-1%7D&sort=11')
soup = BeautifulSoup(links_page.content, 'html.parser')
# Collect the pagination anchors; the second-to-last one links to the final page.
links = [anchor['href'] for anchor in soup.find_all("a", {"class": "css-1yj1f35 excbu0j4"})]
# Number of result pages available for the location in question (page numbers
# are zero-indexed, hence the + 1).
num_pages = int(links[-2].partition('page=')[2]) + 1
# Generate the list of pages that contain properties for this location.
property_links = [root + str(page_number) for page_number in range(num_pages)]
# Root URL that is combined with a property ID to give the detail page for
# each individual property.
root = 'https://en.comparis.ch/immobilien/marktplatz/details/show/'
# Detail-page URLs and the corresponding property IDs, filled in below.
pages = []
property_id = []
for property_link in property_links:
    listing_page = requests.get(property_link)
    listing_soup = BeautifulSoup(listing_page.content, 'html.parser')
    # Property IDs are embedded in the page's JSON payload as '"AdId":<int>,';
    # pull out each match and slice the number from between ':' and ','.
    raw_id_list = re.findall(r'"AdId":[-+]?[0-9]+,', str(listing_soup))
    id_list = [raw_id[raw_id.find(':') + 1:raw_id.find(',')] for raw_id in raw_id_list]
    # Combine the root with each property ID, giving the page for each property.
    property_id.extend(id_list)
    pages.extend(root + ad_id for ad_id in id_list)
# Get the attributes for each property from the Comparis website.
properties = []
for p in pages:
    page = requests.get(p)
    soup = BeautifulSoup(page.content, 'html.parser')
    address_tag = soup.find("h3", {"class": "text-green"})
    attributes_tag = soup.find("dl", {"class": "row xsmall-up-2 medium-up-3 large-up-4 attributes-grid"})
    # A listing that has been taken down (or an unexpected page layout) yields
    # no matching tag; previously this crashed the whole scrape with an
    # AttributeError on None. Record placeholders instead so `properties`
    # stays the same length as `property_id` for the DataFrame built later.
    property_address = list(address_tag) if address_tag is not None else [None]
    property_attributes = list(attributes_tag.stripped_strings) if attributes_tag is not None else []
    properties.append([property_address, property_attributes])
# Check the length of the property attributes list.
len(properties)
# Attribute-based lists gathered from the scraped data. Each list gets one
# entry per scraped property; None marks a value that could not be extracted.
property_type = []
gross_rent = []
net_rent = []
living_space = []
rooms = []
floor = []
available_date = []
public_transport = []
motorway = []
shop = []
# Flatten the property address list.
property_address = [record[0][0] for record in properties]


def _extract_attribute(attributes, label, transform=None):
    """Return the value that follows `label` in a scraped attribute list.

    `attributes` alternates labels and values (e.g. ['Rooms', '3.5', ...]).
    `transform`, when given, converts the raw string value. Any lookup or
    conversion failure yields None so every per-property list stays aligned.
    """
    try:
        raw_value = attributes[attributes.index(label) + 1]
        return transform(raw_value) if transform is not None else raw_value
    except Exception:
        # Best-effort extraction: a missing label or malformed value simply
        # becomes a missing data point (imputed later).
        return None


def _chf_to_float(raw_value):
    """Convert a 'CHF 1,234'-style rent string to a float."""
    return float(raw_value[4:].replace(',', ''))


def _area_to_float(raw_value):
    """Convert a living-space string with a 3-character unit suffix to a float."""
    return float(raw_value[:-3])


def _distance_to_float(raw_value):
    """Convert a distance string with a 2-character unit suffix to a float."""
    return float(raw_value[:-2])


# Cycle through the scraped property data and separate it into the
# attribute-based lists that will be used to create a pandas DataFrame.
for record in properties:
    attributes = record[1]
    property_type.append(_extract_attribute(attributes, 'Property type'))
    gross_rent.append(_extract_attribute(attributes, 'Rent per month', _chf_to_float))
    net_rent.append(_extract_attribute(attributes, 'Rent per month (without charges)', _chf_to_float))
    living_space.append(_extract_attribute(attributes, 'Living space', _area_to_float))
    rooms.append(_extract_attribute(attributes, 'Rooms', get_num_rooms))
    floor.append(_extract_attribute(attributes, 'Floor'))
    available_date.append(_extract_attribute(attributes, 'Available'))
    public_transport.append(_extract_attribute(attributes, 'Public transport stop', _distance_to_float))
    motorway.append(_extract_attribute(attributes, 'Motorway', _distance_to_float))
    shop.append(_extract_attribute(attributes, 'Shops', _distance_to_float))

# IDs arrive as strings from the regex scrape; store them as ints.
property_id = [int(i) for i in property_id]
property_records = pd.DataFrame(
    list(zip(property_id, property_address, property_type, gross_rent, net_rent, living_space, rooms, floor, available_date, public_transport, motorway, shop)),
    columns=['property_id', 'property_address', 'property_type', 'gross_rent', 'net_rent', 'living_space', 'rooms', 'floor', 'available_date', 'public_transport', 'motorway', 'shop'])
# Attempt to load the previously scraped records and combine them with the
# new ones, so each run of the notebook grows the dataset.
try:
    # Load the previous records.
    previous_property_records = pd.read_csv('data/property_records_rent.csv')
    # Concatenate the new records and the previous records, dropping the
    # index column that the earlier to_csv() wrote out.
    property_records = pd.concat([property_records, previous_property_records], axis=0).drop(columns=['Unnamed: 0'])
    # Drop duplicate records, keeping the most recently scraped copy.
    property_records = property_records.drop_duplicates(subset=['property_id'], keep='last').reset_index(drop=True)
except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
    # First run (no saved file yet), an empty file, or an old file without the
    # expected columns: keep only the freshly scraped records. Narrowed from a
    # bare `except:` that silently swallowed every error, including real bugs.
    pass
# Show records.
property_records
# Save the scraped property records.
property_records.to_csv('data/property_records_rent.csv')
In this section, we process the scraped web data. This involves encoding all features as the appropriate data type and performing imputation (i.e. replacing missing data points with the mean, median or mode of the existing data).
# Load data.
property_records = pd.read_csv('data/property_records_rent.csv')
# Display the ratio of missing values for the features below.
# (Series.isna().mean() is the fraction of NaN rows - equivalent to the
# previous nine duplicated shape-ratio print statements.)
for column in ['gross_rent', 'living_space', 'rooms', 'property_address',
               'floor', 'property_type', 'shop', 'public_transport', 'motorway']:
    print(f'{column}:', property_records[column].isna().mean())
# Process the data for use in a price prediction model / pricing analytics.
property_records = process_records(property_records)
# Save the processed property records.
property_records.to_csv('data/processed_property_records_rent.csv')
# Save the possible values for each categorical feature, for later use when
# encoding prediction inputs.
feature_value_files = {
    'data/possible_postcodes.pickle': 'property_postcode',
    'data/possible_floors.pickle': 'floor',
    'data/possible_types.pickle': 'property_type',
}
for pickle_path, column_name in feature_value_files.items():
    with open(pickle_path, 'wb') as handle:
        pickle.dump(list(property_records[column_name].unique()), handle)
In this section we will select, train and save two models - one tree-based model, and one linear regression-based model. The tree-based model will be selected because it has a lower mean absolute error, while the linear regression-based model will be used to extrapolate the price of real estate that falls outside of the range of the training data (i.e. very high-value real estate), since tree-based models cannot predict values that are higher than the highest target value in the dataset on which they are trained.
Note: the linear model assumes that there is a linear relationship between price and other features such as living space and number of rooms for larger properties outside of the dataset.
The methodology used in this Jupyter notebook assumes stability in the price data for the records that were scraped - that is, we assume that the prices did not significantly change over the time period covered by the property listings.
# Load the processed data.
property_records = pd.read_csv('data/processed_property_records_rent.csv')
# Model inputs: everything except identifiers, targets, and the raw
# (un-encoded) categorical columns.
x = property_records[[col for col in property_records.columns if col not in ['property_id', 'gross_rent', 'net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]]
# Target variable: the gross monthly rent.
y = property_records['gross_rent']
fig = px.scatter(property_records, x="rooms", y="gross_rent", color="property_type", title="Gross Rent vs Number of Rooms", hover_data=['property_postcode'])
fig.show()
fig = px.box(property_records, x="rooms", y="gross_rent", title="Gross Rent vs Number of Rooms", points=False)
fig.show()
fig = px.scatter(property_records, x="living_space", y="gross_rent", color="property_type", title="Gross Rent vs Living Space", hover_data=['property_postcode'])
fig.show()
# Titles below corrected: these three plots show postcode, floor and property
# type on the x axis, not living space.
fig = px.box(property_records, x="property_postcode", y="gross_rent", title="Gross Rent vs Postcode", points=False)
fig.update_xaxes(type='category')
fig.show()
fig = px.box(property_records, x="floor", y="gross_rent", title="Gross Rent vs Floor", points=False)
fig.update_xaxes(type='category')
fig.show()
fig = px.box(property_records, x="property_type", y="gross_rent", title="Gross Rent vs Property Type", points=False)
fig.update_xaxes(type='category')
fig.show()
# Scale the float features so they are comparable inputs for the linear models.
columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
scaler = StandardScaler().fit(x[columns])
scaled = scaler.transform(x[columns])
# Reuse x's index so the concat below aligns row-by-row; without it, pandas
# aligns on the default RangeIndex and silently misaligns (or introduces NaNs)
# whenever x carries a non-default index.
scaled = pd.DataFrame(scaled, columns=['scaled_' + column for column in columns], index=x.index)
x = pd.concat([x, scaled], axis=1)
# Save the fitted scaler so the identical transformation can be applied to
# prediction inputs later.
with open('data/scaler.pickle', 'wb') as handle:
    pickle.dump(scaler, handle)
# Drop the unscaled originals in favour of the scaled columns.
x = x.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop'])
# Correlation of each candidate feature with the dependent variable
# (gross_rent): take the full correlation matrix, keep only the gross_rent
# row, and drop its self-correlation.
excluded_columns = ['net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']
correlation_columns = [col for col in property_records.columns if col not in excluded_columns]
correlation_matrix = (
    property_records[correlation_columns]
    .corr()
    .loc[['gross_rent']]
    .drop(['gross_rent'], axis=1)
)
# Visualize the correlation matrix as a heatmap.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, square=True, vmin=-1, vmax=1, ax=ax, linewidths=1, xticklabels=correlation_matrix.columns, cmap="Blues")
plt.yticks(rotation=0)
plt.show()
# Inspect the variance inflation factors for multicollinearity.
get_vifs(x)
The above VIFs indicate, as expected, serious multicollinearity in the data. This is caused by the one-hot encoding of the categorical data. In order to fix this problem, we can eliminate one column from each of the categorical feature sets. We will select the columns below based on their frequency in the data. This should not result in any significant loss in the performance of the model, as the removed values will still be indicated in the data (because all of the remaining columns/features will be 0 if the removed value is present). For example, if we remove the 'Apartment' encoding, then any record for an apartment will have all other property_type encodings set to 0 (e.g. features such as 'Single garage' will all be equal to 0).
# Frequency of each categorical value, most common first. The most frequent
# value of each feature is the one dropped to break the one-hot collinearity.
# (Counter.most_common() returns (value, count) pairs sorted by descending
# count, exactly like the previous sorted(...) expressions.)
Counter(property_records['property_postcode']).most_common()
Counter(property_records['floor']).most_common()
Counter(property_records['property_type']).most_common()
# Columns removed from the Linear Regression inputs in order to eliminate
# the multicollinearity introduced by one-hot encoding.
eliminated_columns = ['8001', '1. floor', 'Apartment']
# The below VIFs for the reduced data indicate no multicollinearity.
get_vifs(x.drop(columns=eliminated_columns))
# Save the list of eliminated columns for later use.
with open('data/eliminated_columns.pickle', 'wb') as handle:
    pickle.dump(eliminated_columns, handle)
# Remove the outliers detected by Tukey's test - this reduced dataset will be
# used in the training of the linear models.
xe, ye = remove_outliers_tukeys_test(x.drop(columns=eliminated_columns), y)
# Use the Gower distance to scale the data for input into UMAP dimensionality
# reduction, which takes into account the float inputs and their interaction
# with the one hot-encoded data.
umap_results = UMAP(n_neighbors=20).fit_transform(gower.gower_matrix(pd.concat([y, x], axis=1)))
outlier_indices = get_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
# Use a set for O(1) membership tests instead of scanning the list per row.
outlier_index_set = set(outlier_indices)
normal_indices = [i for i in range(x.shape[0]) if i not in outlier_index_set]


def _umap_frame(indices, status):
    """Build a DataFrame of 2-D UMAP coordinates labelled with `status`."""
    points = umap_results[indices]
    return pd.DataFrame(
        zip([v[0] for v in points], [v[1] for v in points], [status] * len(indices)),
        columns=['Dimension 1', 'Dimension 2', 'Status'])


outliers = _umap_frame(outlier_indices, 'Outlier')
# Bug fix: the 'Normal' status list was previously sized with
# len(outlier_indices), so zip() silently truncated the normal points to the
# number of outliers. _umap_frame sizes the labels from the indices passed in.
normal = _umap_frame(normal_indices, 'Normal')
# Save the UMAP results as a pandas DataFrame.
umap_data = pd.concat([normal, outliers]).reset_index(drop=True)
# Plot the UMAP results, showing the outliers vs normal data points, based on
# the isolation forest model.
fig = px.scatter(umap_data, x="Dimension 1", y="Dimension 2", color="Status", title="UMAP Result", hover_data=[umap_data.index.values])
fig.show()
# Remove the outliers detected by the isolation forest - this reduced dataset
# will be used in the training of the tree-based models.
xt, yt = remove_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
# Candidate linear models to compare.
model_types = [['Lasso', Lasso()], ['Ridge', Ridge()], ['ElasticNet', ElasticNet()], ['LassoLars', LassoLars()], ['LassoLarsCV', LassoLarsCV()], ['Lars', Lars()], ['LinearRegression', LinearRegression()]]
# Cross-validate each candidate (5 folds) on the Tukey-reduced data.
model_results = train_model(xe, ye, model_types, 5)
# Get the top 5 results, selected based on the mae metric (index 4 of each
# result row; lower is better).
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
top_models
# Train the best model on the expanded dataset.
# Bug fix: this previously fitted model_results[0][1] - the first candidate in
# list order, not the best one - while the reported MAE came from
# top_models[0], so the saved model and its MAE could disagree. Fit the actual
# best model, consistent with the tree-based section below.
linear_pricing_model = top_models[0][1].fit(xe, ye)
linear_pricing_model_mae = top_models[0][4]
linear_pricing_model
# Save the selected model.
with open('models/linear_pricing_model.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model, handle)
# Save the model's MAE.
with open('models/linear_pricing_model_mae.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model_mae, handle)
# Calculate feature importances based on the regression coefficients, sorted
# by absolute weight.
regression_interpretation = pd.DataFrame(sorted(list(zip(xe.columns, linear_pricing_model.coef_)), key=lambda v: abs(v[1]), reverse=False), columns=['Feature', 'Weight'])
# Plot the regression coefficient-based feature importances.
fig = px.scatter(regression_interpretation, x="Weight", y="Feature")
fig.update_yaxes(type='category')
fig.show()
# Candidate tree-based models to compare.
model_types = [['XGBRFRegressor', XGBRFRegressor()], ['AdaBoostRegressor', AdaBoostRegressor()], ['RandomForestRegressor', RandomForestRegressor()], ['ExtraTreesRegressor', ExtraTreesRegressor()], ['DecisionTreeRegressor', DecisionTreeRegressor()], ['GradientBoostingRegressor', GradientBoostingRegressor()]]
# Cross-validate each candidate (3 folds) on the isolation-forest-reduced data.
model_results = train_model(xt, yt, model_types, 3)
# Keep the five best results, ranked by MAE (ascending; index 4 of each row).
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
top_models
# Refit the best candidate on the expanded dataset.
pricing_model = top_models[0][1].fit(xt, yt)
pricing_model_mae = top_models[0][4]
pricing_model
# Persist the selected model and its MAE for the prediction section.
for artefact_path, artefact in [('models/pricing_model.pickle', pricing_model),
                                ('models/pricing_model_mae.pickle', pricing_model_mae)]:
    with open(artefact_path, 'wb') as handle:
        pickle.dump(artefact, handle)
# Calculate and show the raw SHAP values for the model.
# Reference: https://christophm.github.io/interpretable-ml-book/shap.html
# Load the JS visualization code into the notebook.
shap.initjs()
explainer = shap.TreeExplainer(pricing_model)
shap_values = explainer.shap_values(xt)
shap.summary_plot(shap_values, xt)
# Show the SHAP value-based relative model feature importances.
shap.summary_plot(shap_values, xt, plot_type="bar")
# Show the possible values for each categorical feature, separated by blank
# lines (blank line printed before every entry after the first, matching the
# original output exactly).
labelled_pickles = [
    ('Possible Postcodes', 'data/possible_postcodes.pickle'),
    ('Possible Floors', 'data/possible_floors.pickle'),
    ('Possible Property Types', 'data/possible_types.pickle'),
]
for position, (label, pickle_path) in enumerate(labelled_pickles):
    if position:
        print('')
    with open(pickle_path, 'rb') as handle:
        print(label, '=', pickle.load(handle))
# Load the processed records.
property_records = pd.read_csv('data/processed_property_records_rent.csv')


def _load_pickle(pickle_path):
    """Load and return one pickled artefact from disk."""
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# Load the pre-trained models and other required data from their pickle files.
pricing_model = _load_pickle('models/pricing_model.pickle')
pricing_model_mae = _load_pickle('models/pricing_model_mae.pickle')
linear_pricing_model = _load_pickle('models/linear_pricing_model.pickle')
linear_pricing_model_mae = _load_pickle('models/linear_pricing_model_mae.pickle')
eliminated_columns = _load_pickle('data/eliminated_columns.pickle')
scaler = _load_pickle('data/scaler.pickle')
encoder = _load_pickle('data/encoder.pickle')
# Feature values for the property whose price is to be predicted.
living_space = 140
rooms = 5.0
postcode = '8003'
floor = '1. floor'
property_type = 'Apartment'
public_transport = 100
motorway = 100
shop = 100
# Scale and one-hot encode the inputs the same way as the training data.
input_values = encode_input(living_space, rooms, postcode, floor, property_type, public_transport, motorway, shop, scaler, encoder)
input_values
# Use one of: [regression_model, tree_model].
model_type = 'tree_model'
# Columns the models were not trained on: the unscaled floats plus the raw
# categorical columns (their scaled / one-hot counterparts are kept).
raw_input_columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type']
# Calculate the price with the selected model; the linear model additionally
# drops the columns eliminated to avoid multicollinearity.
if model_type == 'regression_model':
    price = linear_pricing_model.predict(input_values.drop(columns=raw_input_columns + eliminated_columns))[0]
    mae = linear_pricing_model_mae
else:
    price = pricing_model.predict(input_values.drop(columns=raw_input_columns))[0]
    mae = pricing_model_mae
calculated_price = pd.concat([pd.DataFrame([price], columns=['gross_rent']), input_values], axis=1)
calculated_price
print('Predicted Price =', price, '+/-', mae, 'CHF')
print('Price Range =', price - mae, 'to', price + mae, 'CHF')
# The predicted price of the property is shown as a red cross, plotted
# alongside properties in its peer group (i.e. properties with the same number
# of rooms and the same property type).
peer_group = property_records[(property_records['rooms'] == rooms) & (property_records['property_type'] == property_type)]
fig = px.scatter(peer_group, x="living_space", y="gross_rent", color="property_type", hover_data=['living_space'])
fig1 = px.scatter(calculated_price, x="living_space", y="gross_rent", title="Calculated Price vs Peer Group", hover_data=['property_postcode'])
fig1.update_traces(marker=dict(size=10, color='Red', symbol='x'))
fig.add_trace(fig1.data[0])
fig.show()